In [1]:
# Notebook setup: imports, NLTK data download, and pandas display options.
import pandas as pd
import sklearn
import pickle
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords  # NLTK English stop-word list
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import plotly.express as px
from pprint import pprint
import spacy  # used for lemmatization
nltk.download('stopwords')  # fetch the stop-word corpus if not already present
import pyLDAvis.gensim_models
import gensim.corpora as corpora
from gensim.models import TfidfModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import ldaseqmodel
import zipfile
import warnings
warnings.filterwarnings('ignore')  # NOTE(review): hides ALL warnings, incl. SettingWithCopyWarning
stop_words=set(nltk.corpus.stopwords.words('english'))  # base English stop words (extended later)


# Download the small English spaCy pipeline used for lemmatization below.
!python -m spacy download en_core_web_sm
pd.set_option('display.max_colwidth', None)  # show full complaint text in DataFrame displays
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
/home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages/scipy/sparse/sparsetools.py:21: DeprecationWarning: `scipy.sparse.sparsetools` is deprecated!
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 28.1 MB/s eta 0:00:0000:010:01m
Requirement already satisfied: spacy<3.5.0,>=3.4.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from en-core-web-sm==3.4.1) (3.4.3)
Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (59.4.0)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.10)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.26.0)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.8)
Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.10.1)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.4.5)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.8)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.3.0)
Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (21.3)
Requirement already satisfied: typer<0.8.0,>=0.3.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.7.0)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.10.2)
Requirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.3)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.3)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.0.9)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.62.3)
Requirement already satisfied: pathy>=0.3.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.9.0)
Requirement already satisfied: numpy>=1.15.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.20.3)
Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.1.5)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.7)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from packaging>=20.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.0.6)
Requirement already satisfied: smart-open<6.0.0,>=5.2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pathy>=0.3.5->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (5.2.1)
Requirement already satisfied: typing-extensions>=4.1.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (4.4.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (1.26.8)
Requirement already satisfied: charset-normalizer~=2.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.8)
Requirement already satisfied: idna<4,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (3.1)
Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2021.10.8)
Requirement already satisfied: blis<0.8.0,>=0.7.8 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.7.9)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from thinc<8.2.0,>=8.1.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (0.0.3)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from typer<0.8.0,>=0.3.0->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (8.0.3)
Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from jinja2->spacy<3.5.0,>=3.4.0->en-core-web-sm==3.4.1) (2.0.1)
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.1
WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
In [3]:
#Please run these pip install if this is the first time you run this notebook and you don't have the package below installed.
# !pip install wordcloud
# !pip install gensim
# !pip install -U spacy[cuda113]
# !pip install pyLDAvis
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting wordcloud
  Downloading wordcloud-1.8.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (458 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 459.0/459.0 KB 11.2 MB/s eta 0:00:0000:01
Requirement already satisfied: numpy>=1.6.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from wordcloud) (1.20.3)
Requirement already satisfied: pillow in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from wordcloud) (9.0.1)
Requirement already satisfied: matplotlib in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from wordcloud) (3.5.0)
Requirement already satisfied: python-dateutil>=2.7 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: kiwisolver>=1.0.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from matplotlib->wordcloud) (1.3.2)
Requirement already satisfied: pyparsing>=2.2.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from matplotlib->wordcloud) (3.0.6)
Requirement already satisfied: fonttools>=4.22.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from matplotlib->wordcloud) (4.28.2)
Requirement already satisfied: cycler>=0.10 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from matplotlib->wordcloud) (21.3)
Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
Installing collected packages: wordcloud
Successfully installed wordcloud-1.8.2.2
WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting gensim
  Downloading gensim-4.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.1/24.1 MB 20.7 MB/s eta 0:00:0000:0100:01
Requirement already satisfied: scipy>=0.18.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from gensim) (1.5.3)
Requirement already satisfied: numpy>=1.17.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from gensim) (1.20.3)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.2.0-py3-none-any.whl (58 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.6/58.6 KB 15.6 MB/s eta 0:00:00
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.2.0 smart-open-6.2.0
WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting spacy[cuda113]
  Downloading spacy-3.4.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.6/6.6 MB 33.7 MB/s eta 0:00:00:00:0100:01
Collecting spacy-legacy<3.1.0,>=3.0.10
  Downloading spacy_legacy-3.0.10-py2.py3-none-any.whl (21 kB)
Requirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy[cuda113]) (3.0.3)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21 kB)
Collecting pathy>=0.3.5
  Downloading pathy-0.9.0-py3-none-any.whl (47 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 47.8/47.8 KB 11.5 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.15.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy[cuda113]) (1.20.3)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.3-py3-none-any.whl (9.3 kB)
Collecting typer<0.8.0,>=0.3.0
  Downloading typer-0.7.0-py3-none-any.whl (38 kB)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy[cuda113]) (2.26.0)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 181.6/181.6 KB 5.4 MB/s eta 0:00:00
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36 kB)
Collecting wasabi<1.1.0,>=0.9.1
  Downloading wasabi-0.10.1-py3-none-any.whl (26 kB)
Requirement already satisfied: packaging>=20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy[cuda113]) (21.3)
Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy[cuda113]) (59.4.0)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from spacy[cuda113]) (4.62.3)
Collecting pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4
  Downloading pydantic-1.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.6/13.6 MB 22.8 MB/s eta 0:00:0000:0100:01
Collecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (492 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 492.6/492.6 KB 4.2 MB/s eta 0:00:0000:01
Collecting preshed<3.1.0,>=3.0.2
  Downloading preshed-3.0.8-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 130.8/130.8 KB 30.2 MB/s eta 0:00:00
Collecting thinc<8.2.0,>=8.1.0
  Downloading thinc-8.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (819 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 819.8/819.8 KB 15.4 MB/s eta 0:00:0000:01
Collecting cupy-cuda113<12.0.0,>=5.0.0b4
  Downloading cupy_cuda113-10.6.0-cp38-cp38-manylinux1_x86_64.whl (77.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 MB 25.3 MB/s eta 0:00:0000:0100:01
Collecting fastrlock>=0.5
  Downloading fastrlock-0.8.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_24_x86_64.whl (48 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 48.9/48.9 KB 11.3 MB/s eta 0:00:00
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from packaging>=20.0->spacy[cuda113]) (3.0.6)
Collecting smart-open<6.0.0,>=5.2.1
  Downloading smart_open-5.2.1-py3-none-any.whl (58 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.6/58.6 KB 11.3 MB/s eta 0:00:00
Collecting typing-extensions>=4.1.0
  Using cached typing_extensions-4.4.0-py3-none-any.whl (26 kB)
Requirement already satisfied: charset-normalizer~=2.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy[cuda113]) (2.0.8)
Requirement already satisfied: idna<4,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy[cuda113]) (3.1)
Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy[cuda113]) (2021.10.8)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy[cuda113]) (1.26.8)
Collecting blis<0.8.0,>=0.7.8
  Downloading blis-0.7.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.2/10.2 MB 28.9 MB/s eta 0:00:0000:010:01
Collecting confection<1.0.0,>=0.0.1
  Downloading confection-0.0.3-py3-none-any.whl (32 kB)
Requirement already satisfied: click<9.0.0,>=7.1.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from typer<0.8.0,>=0.3.0->spacy[cuda113]) (8.0.3)
Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from jinja2->spacy[cuda113]) (2.0.1)
Installing collected packages: wasabi, fastrlock, cymem, typing-extensions, typer, spacy-loggers, spacy-legacy, smart-open, murmurhash, langcodes, cupy-cuda113, catalogue, blis, srsly, pydantic, preshed, pathy, confection, thinc, spacy
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.0.0
    Uninstalling typing_extensions-4.0.0:
      Successfully uninstalled typing_extensions-4.0.0
  Attempting uninstall: smart-open
    Found existing installation: smart-open 6.2.0
    Uninstalling smart-open-6.2.0:
      Successfully uninstalled smart-open-6.2.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.0.1 requires botocore<1.22.9,>=1.22.8, but you have botocore 1.24.19 which is incompatible.
Successfully installed blis-0.7.9 catalogue-2.0.8 confection-0.0.3 cupy-cuda113-10.6.0 cymem-2.0.7 fastrlock-0.8.1 langcodes-3.3.0 murmurhash-1.0.9 pathy-0.9.0 preshed-3.0.8 pydantic-1.10.2 smart-open-5.2.1 spacy-3.4.3 spacy-legacy-3.0.10 spacy-loggers-1.0.3 srsly-2.4.5 thinc-8.1.5 typer-0.7.0 typing-extensions-4.4.0 wasabi-0.10.1
WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 14.4 MB/s eta 0:00:00:00:01
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Installing backend dependencies ... done
  Preparing metadata (pyproject.toml) ... done
Requirement already satisfied: jinja2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (3.0.3)
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Requirement already satisfied: sklearn in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (0.0)
Requirement already satisfied: joblib in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (1.1.0)
Requirement already satisfied: setuptools in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (59.4.0)
Requirement already satisfied: future in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (0.18.2)
Requirement already satisfied: scipy in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (1.5.3)
Requirement already satisfied: numexpr in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (2.7.3)
Requirement already satisfied: pandas>=1.2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (1.3.4)
Requirement already satisfied: numpy>=1.20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (1.20.3)
Requirement already satisfied: scikit-learn in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (1.0.1)
Requirement already satisfied: gensim in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pyLDAvis) (4.2.0)
Requirement already satisfied: python-dateutil>=2.7.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pandas>=1.2.0->pyLDAvis) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from pandas>=1.2.0->pyLDAvis) (2021.3)
Requirement already satisfied: smart-open>=1.8.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from gensim->pyLDAvis) (5.2.1)
Requirement already satisfied: MarkupSafe>=2.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from jinja2->pyLDAvis) (2.0.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from scikit-learn->pyLDAvis) (3.0.0)
Requirement already satisfied: six>=1.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.8/site-packages (from python-dateutil>=2.7.3->pandas>=1.2.0->pyLDAvis) (1.16.0)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (pyproject.toml) ... done
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136882 sha256=11731c38c9507163f5700dc0346d896dbc4d45f5e529c99a9b22101d4b17816c
  Stored in directory: /home/ec2-user/.cache/pip/wheels/90/61/ec/9dbe9efc3acf9c4e37ba70fbbcc3f3a0ebd121060aa593181a
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.17 pyLDAvis-3.3.1
WARNING: You are using pip version 22.0.4; however, version 22.3.1 is available.
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.
In [5]:
# Read in the CFPB consumer-complaints dataset (CSV export dated 2022-10-23).
# The commented block below shows how to read the same file straight from the
# zip archive without unpacking it first.
# import zipfile
# with zipfile.ZipFile('complaints-2022-10-23_09_54.csv.zip') as zip:
#     with zip.open('complaints-2022-10-23_09_54.csv') as myZip:
#         df = pd.read_csv(myZip) 
df=pd.read_csv('complaints-2022-10-23_09_54.csv')
In [7]:
# Restrict to the five companies with the most complaints — the full dataset
# is too large for the LDA model to handle comfortably.
complaint_counts = (
    df.groupby('Company')['Complaint ID']
      .count()
      .reset_index(name='count')
)
top5 = complaint_counts.sort_values(['count'], ascending=False).head(5)
top5_company = list(top5['Company'])
top5_df = df[df['Company'].isin(top5_company)]
In [12]:
# Parse the two date columns (stored as MM/DD/YY strings) into datetimes.
# NOTE(review): top5_df is a filtered slice of df; assigning new columns here
# can raise SettingWithCopyWarning (suppressed above) — consider adding .copy()
# when top5_df is created. TODO confirm intended behavior.
top5_df['Date_received_dt'] = pd.to_datetime(top5_df['Date received'], format='%m/%d/%y')
top5_df['Date_sent_to_company_dt'] = pd.to_datetime(top5_df['Date sent to company'], format='%m/%d/%y')
In [13]:
# Lag between receiving the complaint and forwarding it to the company.
top5_df['time_diff'] = top5_df['Date_sent_to_company_dt']-top5_df['Date_received_dt']
# Percentage of complaints NOT sent to the company on the same date
# (the string '0 days' is coerced to a Timedelta for the comparison).
len(top5_df[top5_df['time_diff']!='0 days'])/len(top5_df)*100
Out[13]:
4.590168615525193
In [14]:
top5_df.reset_index
top5_df=top5_df.set_index('Complaint ID')
top5_df['Date sent to company_dt'] =  pd.to_datetime(top5_df['Date sent to company'], format='%m/%d/%y')
#top10_df_2018bf=df[df['Date sent to company_dt'].dt.year<=2018]

Cleaning step¶

  • Make text all lower case
  • Remove punctuation
  • Remove numerical values
  • Remove common non-sensical text (\n)
  • Tokenize text
  • Remove stop words
  • Stemming / lemmatization
  • Parts of speech tagging
  • Create bi-grams or tri-grams
In [15]:
# Text cleaning - lowercase, remove punctuation and remove words with numbers.

import re
import string

def clean_text(text):
    """Normalize a complaint narrative for topic modeling.

    Steps: lowercase; drop bracketed spans; strip punctuation; drop any word
    containing a digit; remove 'x' characters (PII-redaction markers such as
    XX/XX dates — the '/' is already gone after the punctuation step); and
    remove newlines.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)                              # bracketed spans
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # all punctuation
    text = re.sub(r'\w*\d\w*', '', text)                             # words containing digits
    # Redaction markers: XX/XX-style dates become runs of 'x' once the '/'
    # is stripped, so removing 'x' removes them.  (The original had a second,
    # redundant re.sub('x', ...) pass with a misleading "remove \n" comment;
    # all 'x' characters are already gone after this line, so it was removed.)
    text = re.sub(r'[x*\/]', '', text)
    text = re.sub('\n', '', text)                                    # newlines
    return text

clean_step = clean_text  # direct alias; the original lambda wrapper added nothing
In [16]:
# Although train/test splits are not required for unsupervised learning, we
# hold out 20% of the data as unseen documents for later evaluation.
# random_state=42 makes the split reproducible.
df_train=top5_df.sample(frac=0.8, replace=False, random_state=42)
df_test=top5_df[~top5_df.index.isin(df_train.index)]
In [17]:
# Apply the cleaning step to the training set's complaint narratives.
data_clean = pd.DataFrame(df_train['Consumer complaint narrative'].apply(clean_step))
data_clean.head()
Out[17]:
Consumer complaint narrative
Complaint ID
4911306 ive submitted several disputes to the credit bureaus and to no avail have i received a response from the bureau s equifa and neither please help
3797983 eperian credit reporting agency also eperian information solutions inc has mis represented information on my credit report for several months now and i have been diligent in my efforts to aid them in fiing however i just found out today that i was rejected for a vital eidl aid because they reported an inaccurate score to the sba they are the direct and proimate cause of loss and injury to myself and phone records and recording will show that i tried many times to deal with the company on the matter however never gotten anywhere i am now seeking administrative remedy and will pursue punitive damages and recompensation for the harm cause to myself and by the information agency and request that the cfpb intercede as to ensure there is no reprisal or further mismanagement or my record as a result of future proceedings the damage is directly related to the fact that eperian did not publish my student loan information and thereby produce a credit score the student loans are federal loan managed by i spoke with and they assured me that they report the loan information to all three major bureaus to include eperian the information reports correctly on and but not eperian i request swift action be taken
5328667 this is my third time making a complaint about the accounts that are negatively affecting my credit report these accounts are false and inaccurate
5687883 i have submitted disputes to you and i dont get a response within days and you all are required by law to do so please investigate my disputes thank you
5520082 in accordance with the fair credit reporting act account original creditor account has violated my rights usc section states i have the right to privacy usc section a section it also states a consumer reporting agency can not furnish a account without my written instructions usc a creditor may not treat a payment on a credit card account under an open end consumer credit plan as late for and purpose
In [13]:
# Quick visual sanity check: word cloud over the cleaned training narratives.
corpus_text = ','.join(list(data_clean['Consumer complaint narrative'].values))
# Configure the word cloud renderer.
cloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color='steelblue',
)
# Fit it to the concatenated text and render as an image.
cloud.generate(corpus_text)
cloud.to_image()
Out[13]:

Now, let's clean up the text and then create bi-grams or trigrams, as banking has many terms that should appear together.¶

In [148]:
# Build the stop-word list: NLTK's English stop words plus extra domain terms
# chosen after inspecting the word cloud above.
extra_stops = [
    'sent', 'told', 'received', 'said', 'asked', 'made', 'contacted',
    'know', 'spoke', 'company', 'will', 'year', 'month',
]
stop_words = stopwords.words('english') + extra_stops
In [149]:
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    """Yield a gensim simple_preprocess token list for each input document,
    stripping accents (deacc=True)."""
    for sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(sentence), deacc=True)
        yield tokens
In [150]:
# Tokenize every cleaned narrative into a list of words.
data = data_clean['Consumer complaint narrative'].values.tolist()
data_words = list(sent_to_words(data))

# Peek at the first 30 tokens of the first document.
print(data_words[:1][0][:30])
['ive', 'submitted', 'several', 'disputes', 'to', 'the', 'credit', 'bureaus', 'and', 'to', 'no', 'avail', 'have', 'received', 'response', 'from', 'the', 'bureau', 'equifa', 'and', 'neither', 'please', 'help']
In [27]:
#Define some useful functions that could be used for data cleaning
def remove_stopwords(texts):
    """Tokenize each document and drop any token found in the global
    stop_words list; returns a list of token lists."""
    filtered = []
    for doc in texts:
        tokens = simple_preprocess(str(doc))
        filtered.append([word for word in tokens if word not in stop_words])
    return filtered

#Define a function that creates bigrams
def bigrams(words, bi_min=15, threshold=100):
    """Fit a gensim Phrases model on `words` and return the frozen
    (memory-light) Phraser for applying bigram merges."""
    phrase_model = gensim.models.Phrases(words, min_count=bi_min, threshold=threshold)
    return gensim.models.phrases.Phraser(phrase_model)

#Define a function that creates trigrams
def trigrams(words, tri_min=15, threshold=100):
    """Fit a bigram Phrases model, then a trigram model on top of it, and
    return the frozen trigram Phraser.

    Bug fixes vs. the original:
    - it returned `bigram_mod`, a name never defined in this function —
      a NameError, or worse, a stale bigram model silently picked up from
      the global namespace;
    - the trigram pass hard-coded threshold=100 instead of using the
      `threshold` parameter (default unchanged, so behavior-compatible).
    """
    bigram = gensim.models.Phrases(words, min_count=tri_min, threshold=threshold)
    trigram = gensim.models.Phrases(bigram[words], threshold=threshold)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return trigram_mod

#Define a function that performs lemmatization
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ']):
    """Lemmatize each token list with spaCy (the global `nlp` pipeline),
    keeping only tokens whose POS tag is in allowed_postags.
    See https://spacy.io/api/annotation."""
    lemmatized = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        kept = [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        lemmatized.append(kept)
    return lemmatized
In [ ]:
# Remove stop words from every tokenized document.
data_words_nostops = remove_stopwords(data_words)
In [ ]:
# Form Bigrams
bigram_mod = bigrams(data_words_nostops)
# NOTE(review): this list is named `bigrams`, shadowing the bigrams() helper
# defined above — renaming would avoid confusion, but later lemmatization
# cells slice this list under the name `bigrams`, so it is kept as-is.
bigrams = [bigram_mod[w] for w in data_words_nostops]
In [ ]:
#I commented the trigrams part out, as after reviewing the trigram output, I realized that there are not lot of trigram in our complaint narratives
# Form Trigrams
# tri_mod = trigrams(data_words_nostops)
# trigram = [tri_mod[w] for w in data_words_nostops]
In [30]:
%%time
# Load the small English spaCy pipeline for lemmatization; NER and the parser
# are disabled because only token.lemma_ and token.pos_ are needed, which
# speeds processing up considerably.
spacy.prefer_gpu() # use the GPU if available; or spacy.require_gpu() to insist
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
/home/fangyf/.local/lib/python3.9/site-packages/spacy/language.py:1895: UserWarning: [W123] Argument disable with value ['ner', 'parser'] is used instead of ['senter'] as specified in the config. Be aware that this might affect other components in your pipeline.
  warnings.warn(
CPU times: user 635 ms, sys: 44.2 ms, total: 679 ms
Wall time: 1.94 s

I separated the data lemmatization into four parts due to limited computation power; if they were combined, the kernel might die. With more powerful computing resources, this could be done in one step.¶

In [44]:
%%time
# Perform lemmatization keeping only nouns and adjectives (first 100k docs).
# (The original comment claimed verbs and adverbs were kept too, but
# allowed_postags only lists NOUN and ADJ.)
data_lemmatized_bigrams = lemmatization(bigrams[:100000], allowed_postags=['NOUN', 'ADJ'])
print(data_lemmatized_bigrams[:1][0][:30])
['several', 'dispute', 'credit']
CPU times: user 10min 15s, sys: 1.9 s, total: 10min 17s
Wall time: 10min 21s
In [24]:
%%time
# Lemmatize the second chunk (docs 100k-200k); see the note above In[44].
data_lemmatized_bigrams2 = lemmatization(bigrams[100000:200000], allowed_postags=['NOUN', 'ADJ'])
print(data_lemmatized_bigrams2[:1][0][:30])
['delete', 'information', 'file', 'item', 'report', 'victim', 'identity', 'theft', 'charge', 'ask', 'item', 'correct', 'credit', 'report', 'theft', 'identity', 'federal', 'trade_commission', 'copy', 'federal', 'trade_commission', 'identity', 'theft', 'affidavit', 'item', 'possible', 'hindering', 'advance', 'life', 'house']
CPU times: user 10min 4s, sys: 1.9 s, total: 10min 6s
Wall time: 10min 10s
In [25]:
%%time
# Lemmatize the third chunk (docs 200k-300k).
data_lemmatized_bigrams3 = lemmatization(bigrams[200000:300000], allowed_postags=['NOUN', 'ADJ'])
print(data_lemmatized_bigrams3[:1][0][:30])
['complaint', 'identification', 'fa', 'number', 'work', 'shady', 'dishonest', 'refuse', 'information', 'inaccurate', 'pay', 'service', 'report']
CPU times: user 10min 33s, sys: 4.48 s, total: 10min 38s
Wall time: 10min 42s
In [26]:
%%time
# Lemmatize the final chunk (docs 300k to end).
data_lemmatized_bigrams4 = lemmatization(bigrams[300000:], allowed_postags=['NOUN', 'ADJ'])
print(data_lemmatized_bigrams4[:1][0][:30])
['card', 'day', 'day', 'representative', 'dispute', 'claim', 'credit', 'bureau', 'correct', 'error']
CPU times: user 9min 24s, sys: 2.33 s, total: 9min 26s
Wall time: 9min 30s
In [54]:
#Combine all the lemmatized bigram chunks into one list, in order.
for chunk in (data_lemmatized_bigrams2, data_lemmatized_bigrams3, data_lemmatized_bigrams4):
    data_lemmatized_bigrams.extend(chunk)
In [18]:
#Save the lemmatized bigrams so the ~40-minute lemmatization above does not
#need to be re-run on every kernel restart (uncomment to re-save).
# with open('data_lemmatized_bigrams.txt', 'w') as f:
#     f.write(json.dumps(data_lemmatized_bigrams))

#Read the cached lemmatized bigrams back from the JSON file.
with open('data_lemmatized_bigrams.txt', 'r') as f:
    data_lemmatized_bigrams = json.loads(f.read())
In [19]:
%%time
#Build the id2word dictionary and the bag-of-words corpus.
import gensim.corpora as corpora

# Create Dictionary, then drop tokens appearing in fewer than 10 documents
# or in more than 50% of documents.
id2word = corpora.Dictionary(data_lemmatized_bigrams)
id2word.filter_extremes(no_below=10, no_above=0.5)
id2word.compactify()  # reassign word ids to fill gaps left by filtering
# Create Corpus
texts = data_lemmatized_bigrams

# Term Document Frequency: each doc becomes a list of (token_id, count) pairs
corpus = [id2word.doc2bow(text) for text in texts]

# View the first 30 (id, count) pairs of the first document
print(corpus[:1][0][:30])
[(0, 1), (1, 1)]
CPU times: user 20.5 s, sys: 321 ms, total: 20.8 s
Wall time: 21 s
In [19]:
#Apply TFIDF Model
from gensim.models import TfidfModel
# Fit IDF weights on the BoW corpus; indexing the model with the corpus
# yields a lazily re-weighted TF-IDF view of the documents.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In the following steps, I create LDA models on both the bag-of-words corpus and the TF-IDF corpus, and later decide which one is better to use¶

In [40]:
%%time
#Run LDA on Bag of Words
# NOTE(review): this uses the top-level name `gensim`, but the visible imports
# only bind submodules (gensim.corpora, TfidfModel, ...) — confirm that
# `import gensim` runs in an earlier cell.
lda_model_bow_lem_5comp = gensim.models.LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=10,chunksize=100,
                           workers=7)
CPU times: user 2min 55s, sys: 29.8 s, total: 3min 25s
Wall time: 3min 15s
In [41]:
# Persist the BoW LDA model in both gensim's native format and as a pickle.
# NOTE(review): the name says "5comp" but the model was trained with
# num_topics=10 — consider renaming.
lda_model_bow_lem_5comp.save('lda_model_bow_lem_5comp.model')
with open('lda_model_bow_lem_5comp.pkl', 'wb') as f:
     pickle.dump(lda_model_bow_lem_5comp, f)
In [22]:
#load model 
# Reload the pickled BoW model; the filename string is immediately rebound to
# the loaded model object. (Only unpickle files you created yourself —
# pickle.load can execute arbitrary code.)
lda_model_bow_lem_5comp = 'lda_model_bow_lem_5comp.pkl'
with open(lda_model_bow_lem_5comp, 'rb') as f:
    lda_model_bow_lem_5comp = pickle.load(f)  
    
#load id2word
# gensim saved the dictionary next to the model under this ".id2word" suffix.
id2word_dic='lda_model_bow_lem_5comp.model.id2word'
id2word_bow = corpora.Dictionary.load(id2word_dic)

# Recreate Corpus
texts = data_lemmatized_bigrams

# Term Document Frequency
corpus = [id2word_bow.doc2bow(text) for text in texts]
In [16]:
#Let's print out the 10 topics of the BOW LDA model (top-weighted words each).
pprint(lda_model_bow_lem_5comp.print_topics())
[(0,
  '0.077*"claim" + 0.033*"collection" + 0.032*"reporting" + 0.027*"full" + '
  '0.025*"debt" + 0.024*"compliance" + 0.020*"knowledge" + 0.019*"compliant" + '
  '0.019*"document" + 0.015*"practice"'),
 (1,
  '0.037*"card" + 0.030*"chase" + 0.020*"bank" + 0.018*"money" + 0.017*"time" '
  '+ 0.015*"service" + 0.014*"phone" + 0.014*"charge" + 0.014*"call" + '
  '0.013*"check"'),
 (2,
  '0.084*"payment" + 0.049*"loan" + 0.036*"late" + 0.026*"score" + '
  '0.025*"time" + 0.020*"mortgage" + 0.017*"due" + 0.016*"year" + 0.012*"home" '
  '+ 0.011*"car"'),
 (3,
  '0.110*"identity" + 0.091*"theft" + 0.065*"fraudulent" + 0.042*"victim" + '
  '0.040*"item" + 0.027*"police" + 0.027*"information" + 0.021*"transunion" + '
  '0.019*"score" + 0.015*"record"'),
 (4,
  '0.084*"information" + 0.040*"letter" + 0.039*"dispute" + 0.034*"eperian" + '
  '0.031*"inaccurate" + 0.026*"reporting" + 0.023*"complaint" + 0.022*"item" + '
  '0.022*"investigation" + 0.021*"law"'),
 (5,
  '0.108*"inquiry" + 0.043*"name" + 0.040*"information" + 0.038*"address" + '
  '0.037*"number" + 0.029*"bankruptcy" + 0.020*"file" + 0.018*"security" + '
  '0.017*"unauthorized" + 0.016*"company"'),
 (6,
  '0.157*"balance" + 0.135*"date" + 0.032*"charge" + 0.031*"status" + '
  '0.029*"amount" + 0.027*"acct" + 0.024*"creditor" + 0.023*"last" + '
  '0.022*"due" + 0.021*"payment"'),
 (7,
  '0.113*"day" + 0.077*"equifa" + 0.064*"item" + 0.037*"letter" + '
  '0.035*"thing" + 0.035*"breach" + 0.035*"dispute" + 0.034*"datum" + '
  '0.034*"information" + 0.033*"response"'),
 (8,
  '0.156*"consumer" + 0.096*"section" + 0.066*"agency" + 0.062*"right" + '
  '0.061*"information" + 0.053*"reporting" + 0.032*"fair" + 0.027*"privacy" + '
  '0.026*"accordance" + 0.025*"act"'),
 (9,
  '0.062*"debt" + 0.038*"creditor" + 0.037*"collection" + 0.026*"original" + '
  '0.025*"violation" + 0.023*"consumer" + 0.018*"law" + 0.017*"information" + '
  '0.017*"contract" + 0.016*"proof"')]
In [ ]:
#Run LDA on TF-IDF
# Same hyperparameters as the BoW model; `id2word` is the shared dictionary,
# which is valid since corpus_tfidf reuses the same token ids.
lda_model_tfidf_lem_5 = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=id2word, passes=10,chunksize=100,
                           workers=7)
In [ ]:
#This step saves the model to the specified location, in both gensim's
# native format and as a pickle.
lda_model_tfidf_lem_5.save('lda_model_tfidf_lem_5.model')
with open('lda_model_tfidf_lem_5.pkl', 'wb') as f:
     pickle.dump(lda_model_tfidf_lem_5, f)
In [23]:
#load model
# Reload the pickled TF-IDF LDA model; only unpickle trusted files.
from gensim.models import TfidfModel 
lda_model_tfidf_lem_5 = 'lda_model_tfidf_lem_5.pkl'
with open(lda_model_tfidf_lem_5, 'rb') as f:
    lda_model_tfidf_lem_5 = pickle.load(f)  
In [24]:
#load id2word
# Load the dictionary that was saved alongside the TF-IDF LDA model.
id2word_dic_tfidf='lda_model_tfidf_lem_5.model.id2word'
id2word_tfidf = corpora.Dictionary.load(id2word_dic_tfidf)


# Term Document Frequency
# Fix: build the corpus with the dictionary loaded just above. The original
# used id2word_bow here and left id2word_tfidf unused; both dictionaries were
# saved from the same source dictionary, but the TF-IDF model must be paired
# with its own id2word to keep token ids consistent.
corpus = [id2word_tfidf.doc2bow(text) for text in texts]

# Re-weight the bag-of-words counts with TF-IDF.
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
In [21]:
#Let's print out the 10 topics of the TFIDF LDA model (top-weighted words each).
pprint(lda_model_tfidf_lem_5.print_topics())
[(0,
  '0.119*"inquiry" + 0.040*"unauthorized" + 0.034*"hard" + 0.026*"inquire" + '
  '0.021*"company" + 0.018*"personal" + 0.016*"fraudulent" + '
  '0.016*"transunion" + 0.014*"date" + 0.013*"information"'),
 (1,
  '0.022*"collection" + 0.015*"debt" + 0.012*"medical" + 0.012*"claim" + '
  '0.010*"information" + 0.009*"possible" + 0.009*"compliance" + '
  '0.008*"compliant" + 0.007*"bill" + 0.007*"showing"'),
 (2,
  '0.031*"complaint" + 0.030*"filing" + 0.026*"d" + 0.021*"party" + '
  '0.020*"false" + 0.016*"behalf" + 0.014*"deposit" + 0.014*"boa" + '
  '0.012*"third" + 0.010*"error"'),
 (3,
  '0.033*"payment" + 0.025*"late" + 0.022*"loan" + 0.018*"balance" + '
  '0.016*"score" + 0.011*"date" + 0.010*"due" + 0.010*"status" + '
  '0.009*"mortgage" + 0.009*"full"'),
 (4,
  '0.044*"identity" + 0.042*"theft" + 0.029*"fraudulent" + 0.028*"victim" + '
  '0.019*"name" + 0.018*"police" + 0.015*"security" + 0.015*"fraud" + '
  '0.015*"item" + 0.014*"social"'),
 (5,
  '0.019*"consumer" + 0.015*"debt" + 0.014*"information" + 0.012*"violation" + '
  '0.011*"law" + 0.010*"fcra" + 0.010*"proof" + 0.009*"file" + 0.009*"agency" '
  '+ 0.009*"reporting"'),
 (6,
  '0.130*"section" + 0.101*"right" + 0.068*"privacy" + 0.064*"furnish" + '
  '0.063*"instruction" + 0.059*"consumer" + 0.058*"accordance" + 0.048*"fair" '
  '+ 0.047*"agency" + 0.043*"act"'),
 (7,
  '0.022*"balance" + 0.022*"dispute" + 0.021*"bankruptcy" + 0.021*"letter" + '
  '0.021*"information" + 0.017*"eperian" + 0.016*"inaccurate" + '
  '0.015*"incorrect" + 0.015*"address" + 0.015*"bureaus"'),
 (8,
  '0.017*"chase" + 0.017*"card" + 0.011*"bank" + 0.010*"money" + 0.008*"phone" '
  '+ 0.008*"check" + 0.008*"charge" + 0.008*"time" + 0.008*"fee" + '
  '0.008*"service"'),
 (9,
  '0.045*"day" + 0.044*"item" + 0.036*"litigation" + 0.033*"unknown" + '
  '0.033*"thing" + 0.030*"breach" + 0.030*"inaccurate" + 0.026*"wrong" + '
  '0.025*"stress" + 0.024*"later"')]
In [27]:
#Visualize the key words in each topic
# Adapted from: https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

# One colour per topic; 'mcolors.XKCD_COLORS' offers more if >10 topics are used.
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  # Late binding on `i` is intentional: the loop variable below
                  # selects the current topic's colour each time the cloud is
                  # regenerated.
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs.
topics = lda_model_tfidf_lem_5.show_topics(formatted=False)

fig, axes = plt.subplots(2, 5, figsize=(15,15), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Draw directly on each axes. The original called fig.add_subplot(ax),
    # which is deprecated and unnecessary: plt.subplots already created and
    # attached these axes to the figure.
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(i), fontdict=dict(size=16))
    ax.axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()
Matplotlib is building the font cache; this may take a moment.
In [25]:
#Visualize the bag of words with top 10 topics using pyLDAvis
# prepare() needs the model, the corpus it was trained on, and its dictionary.
vis = pyLDAvis.gensim_models.prepare(lda_model_bow_lem_5comp, corpus, id2word)
pyLDAvis.display(vis)
Out[25]:
In [26]:
#Visualize the tfidf with top 10 topics using pyLDAvis
# Same interactive topic map, but for the TF-IDF-trained model/corpus.
vis2 = pyLDAvis.gensim_models.prepare(lda_model_tfidf_lem_5, corpus_tfidf, id2word)
pyLDAvis.display(vis2)
Out[26]:

Next, let's see how the models perform, and pick the better model based on coherence value.¶

In [66]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute u_mass coherence for various numbers of topics.

    Note: despite the original wording ("c_v"), this uses the 'u_mass'
    measure, which works from the corpus alone; the `texts` argument is
    currently unused but kept for interface compatibility (it would be
    required for 'c_v' coherence).

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts (unused with 'u_mass')
    limit : Max num of topics (exclusive)
    start : Smallest number of topics to try
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Train a fresh multicore LDA model for each candidate topic count.
        model=gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=num_topics,chunksize=100,
                           workers=7)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values
In [32]:
%%time
# Baseline u_mass coherence of the 10-topic BoW model.
coherencemodel_bow = CoherenceModel(model=lda_model_bow_lem_5comp, corpus=corpus, dictionary=id2word, coherence='u_mass')
coherencemodel_bow.get_coherence()
Out[32]:
-1.9108488229692984
In [64]:
%%time
# Baseline u_mass coherence of the 10-topic TF-IDF model, for comparison.
coherencemodel_tfidf = CoherenceModel(model=lda_model_tfidf_lem_5, corpus=corpus_tfidf, dictionary=id2word, coherence='u_mass')
coherencemodel_tfidf.get_coherence()
CPU times: user 35.6 s, sys: 188 ms, total: 35.8 s
Wall time: 35.9 s
Out[64]:
-2.2466021740921223
In [67]:
# Sweep the number of topics on the BoW corpus and plot u_mass coherence.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=texts, start=2, limit=15, step=3)
# Show graph
import matplotlib.pyplot as plt
# NOTE: limit=16 here vs limit=15 above — both ranges happen to yield
# [2, 5, 8, 11, 14], so x and coherence_values stay aligned.
limit=16; start=2; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Fix: legend() expects a sequence of labels; a bare string is iterated
# character-by-character, so the original legend displayed only "c".
plt.legend(["coherence_values"], loc='best')
plt.show()
In [42]:
# Same sweep as above, but on the TF-IDF corpus (rebinds model_list and
# coherence_values, so the two cells below must run right after their sweep).
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus_tfidf, texts=texts, start=2, limit=15, step=3)
# Show graph
import matplotlib.pyplot as plt
limit=16; start=2; step=3;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Fix: legend() expects a sequence of labels; a bare string is iterated
# character-by-character, so the original legend displayed only "c".
plt.legend(["coherence_values"], loc='best')
plt.show()
In [46]:
#tfidf
# Pick the candidate with the minimum u_mass coherence (this notebook treats
# lower u_mass as better).
best_result_index = coherence_values.index(min(coherence_values))
optimal_model = model_list[best_result_index]
# Select the model and print the topics
model_topics = optimal_model.show_topics(formatted=False)
# Fix: the original message said "highest" while min() was used, and the
# "\\" inside the triple-quoted f-string printed a stray literal backslash.
print(f'The {x[best_result_index]} topics gives the best (lowest u_mass) '
      f'coherence score of {coherence_values[best_result_index]}')
The 14 topics gives the highest coherence score \
of -2.496682136236564
In [69]:
#bag of words
# Pick the candidate with the minimum u_mass coherence (this notebook treats
# lower u_mass as better).
best_result_index = coherence_values.index(min(coherence_values))
optimal_model = model_list[best_result_index]
# Select the model and print the topics
model_topics = optimal_model.show_topics(formatted=False)
# Fix: the original message said "highest" while min() was used, and the
# "\\" inside the triple-quoted f-string printed a stray literal backslash.
print(f'The {x[best_result_index]} topics gives the best (lowest u_mass) '
      f'coherence score of {coherence_values[best_result_index]}')
The 14 topics gives the highest coherence score \
of -1.9218424997696348

Based on the u_mass coherence value, the lower the better. Both BOW and TFIDF have lower coherence values with more topics, so it makes sense to pick a turning point in the number of topics.¶

Now, let's apply our model to test dataset¶

In [23]:
#First, we have to pre-process the test dataset with the same cleaning
# pipeline used for training (round1 cleaning + sentence-to-word tokenization).
data_clean_test = pd.DataFrame(df_test['Consumer complaint narrative'].apply(round1))
data_test = data_clean_test['Consumer complaint narrative'].values.tolist()
data_words_test = list(sent_to_words(data_test))

print(data_words_test[:1][0][:30])
['recently', 'in', 'of', 'this', 'year', 'my', 'wife', 'and', 'paid', 'off', 'over', 'in', 'credit', 'card', 'balances', 'in', 'doing', 'so', 'my', 'credit', 'score', 'went', 'for', 'to', 'with', 'no', 'other', 'items', 'being', 'reported']
In [24]:
# Remove stopwords, then fold frequent word pairs into bigram tokens,
# mirroring the training preprocessing.
data_words_nostops_test = remove_stopwords(data_words_test)
bigram_mod_test = bigrams(data_words_nostops_test)
bigrams_test = [bigram_mod_test[w] for w in data_words_nostops_test]
In [26]:
%%time
# Lemmatize the test bigrams, keeping nouns and adjectives only.
spacy.prefer_gpu() # or spacy.require_gpu()
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])
data_lemmatized_bigrams_test = lemmatization(bigrams_test, allowed_postags=['NOUN', 'ADJ'])
print(data_lemmatized_bigrams_test[:1][0][:30])
['year', 'wife', 'credit', 'card', 'balance', 'credit', 'score', 'item', 'credit', 'score', 'minute', 'phone', 'possible', 'piece', 'personal', 'information', 'identity', 'verification', 'process', 'submit', 'identification', 'writing', 'credit', 'score', 'credit', 'card', 'credit', 'hostage', 'reason', 'impossible']
CPU times: user 10min 51s, sys: 1.67 s, total: 10min 53s
Wall time: 10min 57s
In [28]:
#Save the bigrams
# Cache the lemmatized test bigrams so the ~10-minute step above can be skipped.
with open('data_lemmatized_bigrams_test.txt', 'w') as f:
    f.write(json.dumps(data_lemmatized_bigrams_test))

#Load back the bigrams if you want to directly see the results
# with open('data_lemmatized_bigrams_test.txt', 'r') as f:
#     data_lemmatized_bigrams_test = json.loads(f.read())
In [29]:
# Build a dictionary and BoW corpus for the test set.
# NOTE(review): this fits a *new* dictionary on the test data; its token ids
# will not match the dictionary the LDA models were trained with — confirm
# this is intended before feeding corpus_test into those models below.
id2word_test = corpora.Dictionary(data_lemmatized_bigrams_test)
id2word_test.filter_extremes(no_below=10, no_above=0.5)
id2word_test.compactify()
# Create Corpus
texts_test = data_lemmatized_bigrams_test

# Term Document Frequency
corpus_test = [id2word_test.doc2bow(text) for text in texts_test]

# View
print(corpus_test[:1][0][:30])
[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 3), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]
In [33]:
# I referenced this function from post - https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# This function builds a dataframe with the dominant topic, its percentage
# contribution and the key words for each unseen document.
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=texts):
    """
    Return a DataFrame with one row per document: dominant topic id, its
    contribution (probability, rounded to 4 dp), the topic's top keywords,
    and the original text appended as the last column.

    Parameters
    ----------
    ldamodel : trained gensim LDA model
    corpus : gensim corpus (must use the model's dictionary)
    texts : sequence of original documents aligned with `corpus`
    """
    # Collect rows in a plain list and build the frame once at the end:
    # DataFrame.append() was removed in pandas 2.0 and was O(n^2) anyway.
    rows = []

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list
        # Sort this document's topics by probability, highest first.
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        if row:  # documents with no topic assignment are skipped, as before
            topic_num, prop_topic = row[0]  # => dominant topic
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            rows.append(pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]))

    sent_topics_df = pd.DataFrame(rows)
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)
In [221]:
start=0
stop=20
# I created this function based on the idea of the function format_topics_sentences above.
# Similarly, it builds a dataframe with the dominant topic, its percentage
# contribution and the key words for each of the trained documents.

def get_documents_topic(ldamodel=None,corpus=corpus,texts=texts,range_start=start,range_stop=stop):
    """
    Annotate rows [range_start, range_stop) of a copy of the module-level
    df_train with each document's dominant topic, its probability, the topic's
    top keywords and the document's token list; return only that slice.

    Parameters
    ----------
    ldamodel : trained gensim LDA model
    corpus : gensim corpus aligned (row for row) with df_train
    texts : list of token lists aligned with df_train
    range_start, range_stop : positional row range to annotate
    """
    df_train1 = df_train.copy()
    df_train1[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']] = ''
    # Pre-resolve column positions so we can assign with .iat. The original
    # used chained `df[col].iloc[i] = ...`, which raises SettingWithCopyWarning
    # and is not guaranteed to write through to the frame.
    col_pos = {name: df_train1.columns.get_loc(name)
               for name in ('Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text')}
    for i in range(range_start, range_stop):
        # Fix: use the `corpus` parameter. The original hard-coded the global
        # corpus_tfidf here, silently ignoring whatever corpus was passed in.
        topic, prob = sorted(ldamodel.get_document_topics(corpus[i]), key=lambda x: (x[1]), reverse=True)[0]
        wp = ldamodel.show_topic(topic)
        topic_keywords = ", ".join([word for word, prop in wp])
        df_train1.iat[i, col_pos['Dominant_Topic']] = topic
        df_train1.iat[i, col_pos['Perc_Contribution']] = prob
        df_train1.iat[i, col_pos['Topic_Keywords']] = topic_keywords
        df_train1.iat[i, col_pos['Text']] = texts[i]

    return df_train1.iloc[range_start:range_stop]
In [190]:
%%time
# Again, due to limited computing resources, this step is split into multiple
# chunks to avoid the kernel dying in the middle of the process.
start=0
stop=100000
train_result=get_documents_topic(ldamodel=lda_model_tfidf_lem_5,corpus=corpus_tfidf,texts=texts,range_start=start,range_stop=stop)
/sw/pkgs/arc/python3.9-anaconda/2021.11/lib/python3.9/site-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
CPU times: user 6min 37s, sys: 1.59 s, total: 6min 39s
Wall time: 6min 41s
In [ ]:
#Save the first chunk of results just in case the kernel dies later.
train_result.to_csv('train_result1.csv')
In [224]:
%%time
# Chunk 2: annotate rows 100k-200k.
start=100000
stop=200000
train_result2=get_documents_topic(ldamodel=lda_model_tfidf_lem_5,corpus=corpus_tfidf,texts=texts,range_start=start,range_stop=stop)
/sw/pkgs/arc/python3.9-anaconda/2021.11/lib/python3.9/site-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
CPU times: user 13min 20s, sys: 1.66 s, total: 13min 22s
Wall time: 13min 26s
In [226]:
#Save the second chunk of results just in case.
train_result2.to_csv('train_result2.csv')
In [227]:
%%time
# Chunk 3: annotate rows 200k-300k.
start=200000
stop=300000
train_result3=get_documents_topic(ldamodel=lda_model_tfidf_lem_5,corpus=corpus_tfidf,texts=texts,range_start=start,range_stop=stop)
CPU times: user 13min 21s, sys: 2.09 s, total: 13min 24s
Wall time: 13min 28s
In [228]:
#Save the third chunk of results just in case.
train_result3.to_csv('train_result3.csv')
In [229]:
%%time
start=300000
stop=len(df_train1)
train_result4=get_documents_topic(ldamodel=lda_model_tfidf_lem_5,corpus=corpus_tfidf,texts=texts,range_start=start,range_stop=stop)
CPU times: user 12min 10s, sys: 1.79 s, total: 12min 12s
Wall time: 12min 16s
In [230]:
#Save the fourth chunk of results just in case.
train_result4.to_csv('train_result4.csv')
In [231]:
#concatenate all trained results into one dataframe and persist it
frames = [train_result, train_result2, train_result3,train_result4]
train_results = pd.concat(frames)
train_results.to_csv('train_results_combined.csv')
In [35]:
#Read in the combined result directly from the zip archive.
# Note: this rebinds `train_result` (previously chunk 1) to the full dataset.
with zipfile.ZipFile('train_results_combined.csv.zip') as zip:
    with zip.open('train_results_combined.csv') as myZip:
        train_result = pd.read_csv(myZip) 
Here's the list of topics summarized based on the key words in each topic¶
  • Topic 0 : Unauthorized hard inquiry
  • Topic 1: Debt Collection
  • Topic 2: Third Party Error
  • Topic 3: Late Payment
  • Topic 4: Identity theft and Fraud
  • Topic 5: Inaccurate Information, Violation of FCRA
  • Topic 6: Privacy
  • Topic 7: Dispute Inaccurate Balance
  • Topic 8: Charge fee
  • Topic 9: Litigation, Breach of contract
In [98]:
#Let's quickly see how many complaints we have per dominant topic in the
# training results, largest first.
train_result.groupby(['Dominant_Topic'])['Complaint ID'].count().reset_index(name='count') \
                             .sort_values(['count'], ascending=False) 
Out[98]:
Dominant_Topic count
8 8 78324
5 5 65998
7 7 62466
3 3 62215
4 4 39250
0 0 29823
6 6 24833
9 9 15250
1 1 9112
2 2 4959

Now, let's take a quick look at how the dominant topic distribution is like in different companies.¶

In [36]:
# Per-company counts of dominant topics, plus each topic's share (%) within
# its company.
df_company_Topic=train_result.groupby(['Company','Dominant_Topic'])['Complaint ID'].count().reset_index(name='count') \
                             .sort_values(['count'], ascending=False) 
df_company_Topic['Percentage']=df_company_Topic.groupby(['Company'])['count'].apply(lambda x: x*100/x.sum())
In [37]:
# One bar chart per company: percentage of complaints by dominant topic.
for i in df_company_Topic['Company'].unique():
    Company_df = df_company_Topic.loc[df_company_Topic['Company'] == i]

    fig = px.bar(Company_df, x='Dominant_Topic', y='Percentage', title=f'Company:{i}')
    fig.show() 

Let's then apply the model on unseen documents that we excluded from our training process.¶

In [34]:
%%time
#Produce the results for the unseen documents
# NOTE(review): corpus_test was built with id2word_test (fitted on the test
# set), not with the dictionary lda_model_tfidf_lem_5 was trained on; token
# ids may not line up with the model's vocabulary — verify before trusting
# these assignments.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model_tfidf_lem_5, corpus=corpus_test, texts=texts_test)
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# This .head(10) result is discarded (it is not the cell's last statement).
df_dominant_topic.head(10)
df_dominant_topic.to_csv('results on unseen documents.csv')
In [152]:
#Load the csv - run this cell to load the saved result directly instead of
# recomputing the cell above.
df_dominant_topic=pd.read_csv('results on unseen documents.csv')
In [138]:
%%time
#Make a copy of the test dataset and get it ready to merge with our topic
# modeling result.
df_test1=df_test.copy()
df_test1.reset_index(inplace=True)
#Merge the topics onto the original test dataframe.
# NOTE(review): this is a positional (index-on-index) merge — it assumes
# df_dominant_topic rows are in exactly the same order as df_test.
combined_unseen_documents=pd.merge(df_test1, df_dominant_topic, on=None, left_index=True,right_index=True)
#Save the result to csv
combined_unseen_documents.to_csv('combined_unseen_documents.csv')
CPU times: user 10.5 ms, sys: 10.9 ms, total: 21.4 ms
Wall time: 20.6 ms

Let's create a word cloud for each topic, based on the frequency of the word in each topic. The higher the frequency, the larger the size of the term.¶

Next, I am interested to learn more about how the terms and words change or evolve throughout the period 2015 to 2022. We will leverage ldaseqmodel for this task.¶

In [34]:
# Due to limited computing resources, sample 2% of the training data per year
# for the dynamic topic model (fixed random_state for reproducibility).
df_train_dtm=df_train.groupby(df_train['Date_received_dt'].dt.year).sample(frac=0.02, replace=False, random_state=42)
In [41]:
#save the sampled data so the exact sample can be reloaded later
df_train_dtm.to_csv('sampled_train_ldaseq.csv')
In [21]:
# Documents per year in the sample; the time_slice list further below must
# match these counts.
index = pd.Index(df_train_dtm['Date_received_dt'].dt.year)
index.value_counts()  #time_slices should be changed based on these counts
Out[21]:
2022    2193
2021    1504
2020    1386
2019     845
2018     685
2017     657
2016     337
2015     237
Name: Date_received_dt, dtype: int64
In [22]:
# Apply the round1 text-cleaning function to the sampled narratives.
data_clean_dtm = pd.DataFrame(df_train_dtm['Consumer complaint narrative'].apply(round1))
data_clean_dtm.head()
Out[22]:
Consumer complaint narrative
Complaint ID
1565911 transunion credit reporting agency has received from on my was submitted paidoff of my account number for this vehicle loan loan number has reported this vehicle loan paid off for several months but transunion s efforts of attempting to compromise and prejudice my credit score as an consumer retiree author activist advocate veteran fighter for justice for as an federal agency would cfpb instruct transunion in showing my loan number with as paidoff also inform transunion of my goal of making them pay for continuous races and discriminatory practices against my credit file in federal district court in ga as soon as the va release my retroactive entitlements citing nearly a dollars lastly transunion if you want to play let s play but at the end liabilities will be resolve for with my credit report do you get it again the balance of auto loan with was paidoff of and reported by as transunion is very much aware i spoke with and with in their reporting department statement is that they ve reported this vehicle as being paid off in there s transmission to the bureaus transunion i m going to sue you stop with the racism leave my credit file along i ll show you folks transunion in federal court i see what you folks are attempting and notice i said attempting cfpb thanks for going after these crooks transunion i will keep fighting for justice for as consumers
1619063 bank of america is inaccurately and incompletely furnishing information regardingthe one and only bank of america account listed on my report to the credit reporting agencies inviolation of the reporting requirements of the fcra section responsibilities of furnishers of information and for which bofa is responsible i dispute their information in its entirety and request evidentiary documentation that substantiates the information they have furnished and its reporting
1293340 i had a account with that was eventually sent to a collection agency it has been on all of my credit files for over years with collection agency and as of it had been resold to another collection agency and is being report as a new collection account on my credit file as of that date the account should have been removed from my credit file in for good the collection agency is wa and its for account for i feel that the agency is unfairly putting fraudulent info on my credit report
1450859 on or about i received an approval letter from my senior lien holder and immediately forwarded it over to the junior lien holder chase i have received several etensions from the senior lien holder and i have informed chase that i can not receive another etension this latest etension epires on i have phoned emailed and sent messages through equator and still no result on my approval letter my file has sat with chase for over a month and was not touched it took months to get my address corrected it is as if the bank is pushing me into a foreclosure situation i see no evidence of them doing what the government says the lenders should to provide foreclosure alternatives to us as homeowners i have tried as hard a i can to make this work and the bank is being unreasonable so due to their inactivity and unwillingness to assist me i have reached out to you i am desperate for assistance
1541365 i submitted an identical complaint against chase in for unauthorized interest charges and after your involvment they resolved it the case number was however chase has done the same unlawful act of charging interest after receving payment i had a statement balance which was due on on i had made payments totaling i was blocked from making the final payment by a hour payment lock and i am willing to pay interest on that but not on a balance that was paid i reached out to chase to resolve the issue without cfpb s involvement but my request denied without any eplanation i am an and now that they have done this a second time i am considering filing a unlawful business practices lawsuit under business professions code et seq i also think chase is engaging in this kind of behavior routinely of false interest charges and needs to be kept on a tighter leash all documentation showing the payments were made is attached the total interest charged was based on the balance which was paid there was a balance on my statement and i was willing to pay the approimately on that but now that chase has made me to go to cfpb again for bogus charges i would like to pay no interest more importantly other consumers need to be protected from these type of unlawful charges
In [25]:
#Tokenize the cleaned narratives into lists of words
data_dtm = data_clean_dtm['Consumer complaint narrative'].values.tolist()
data_words_dtm = list(sent_to_words(data_dtm))

print(data_words_dtm[:1][0][:30])
['transunion', 'credit', 'reporting', 'agency', 'has', 'received', 'from', 'on', 'my', 'was', 'submitted', 'paidoff', 'of', 'my', 'account', 'number', 'for', 'this', 'vehicle', 'loan', 'loan', 'number', 'has', 'reported', 'this', 'vehicle', 'loan', 'paid', 'off', 'for']
In [28]:
#Remove stopwords and convert to bigrams, as in the main pipeline
data_words_nostops_dtm = remove_stopwords(data_words_dtm)
bigram_mod_dtm = bigrams(data_words_nostops_dtm)
bigrams_dtm = [bigram_mod_dtm[w] for w in data_words_nostops_dtm]
In [31]:
#lemmatization on bigrams (nouns and adjectives only)
data_lemmatized_bigrams_dtm = lemmatization(bigrams_dtm, allowed_postags=['NOUN', 'ADJ'])
print(data_lemmatized_bigrams_dtm[:1][0][:30])
['transunion', 'credit', 'reporting', 'agency', 'paidoff', 'account', 'number', 'vehicle', 'loan', 'loan', 'number', 'vehicle', 'loan', 'several', 'month', 'transunion', 'effort', 'compromise', 'prejudice', 'credit', 'score', 'consumer', 'retiree', 'author', 'activist', 'advocate', 'veteran', 'fighter', 'federal', 'transunion']
In [40]:
#Save the bigrams so the lemmatization step can be skipped on re-runs
with open('data_lemmatized_bigrams_dtm.txt', 'w') as f:
    f.write(json.dumps(data_lemmatized_bigrams_dtm))
In [30]:
# Reload the cached lemmatized bigrams for the dynamic topic model.
with open('data_lemmatized_bigrams_dtm.txt', 'r') as f:
    data_lemmatized_bigrams_dtm = json.loads(f.read())
In [31]:
# Dictionary, BoW corpus and TF-IDF corpus for the dynamic topic model,
# built the same way as for the main models above.
id2word_dtm = corpora.Dictionary(data_lemmatized_bigrams_dtm)
id2word_dtm.filter_extremes(no_below=10, no_above=0.5)
id2word_dtm.compactify()
# Create Corpus
texts_dtm = data_lemmatized_bigrams_dtm

# Term Document Frequency
corpus_dtm = [id2word_dtm.doc2bow(text) for text in texts_dtm]

# View
print(corpus_dtm[:1][0][:30])
tfidf_dtm = TfidfModel(corpus_dtm)
corpus_tfidf_dtm = tfidf_dtm[corpus_dtm]
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 2), (15, 1), (16, 1), (17, 5), (18, 1), (19, 1), (20, 3), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 9)]
In [37]:
# Per-slice document counts for the dynamic topic model — presumably the
# number of complaints per year, matching `years` below (TODO confirm).
# The commented line was an earlier uniform-slice experiment.
#time_slice = [50,50,50,50,50,50,50,50]
time_slice = [237, 337, 657, 685, 845, 1386, 1504, 2193]
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
num_topics = 10
In [38]:
%%time
# Fit the dynamic (sequential) LDA model: num_topics topics evolving across
# the yearly time slices defined above. This is the expensive step
# (~48 min wall time in the recorded run), hence the %%time magic and the
# persistence below so later cells can reload instead of retraining.
# NOTE(review): the corpus passed in is TF-IDF-weighted; LdaSeqModel is
# normally fed raw bag-of-words counts — confirm this is intentional.
ldaseq_test = ldaseqmodel.LdaSeqModel(corpus=corpus_tfidf_dtm, id2word=id2word_dtm, time_slice=time_slice, num_topics=num_topics)
# Persist twice: gensim's native save format and a pickle (reloaded below).
ldaseq_test.save('ldaseq_test.model')
with open('ldaseq_test.pkl', 'wb') as f:
     pickle.dump(ldaseq_test, f)
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:297: RuntimeWarning: divide by zero encountered in double_scalars
  convergence = np.fabs((bound - old_bound) / old_bound)
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
/home/fangyf/.local/lib/python3.9/site-packages/gensim/models/ldaseqmodel.py:1478: RuntimeWarning: invalid value encountered in double_scalars
  converged = np.fabs((lhood_old - lhood) / (lhood_old * total))
CPU times: user 48min 48s, sys: 2min 39s, total: 51min 28s
Wall time: 48min 31s
In [28]:
# Reload the trained dynamic topic model saved by the training cell.
# Fix: the original reused the name `model2` for both the path string and
# the loaded model — use a distinct name for the path.
# NOTE: pickle.load executes arbitrary code; only load files you created.
model2_path = 'ldaseq_test.pkl'
with open(model2_path, 'rb') as f:
    model2 = pickle.load(f)
In [32]:
doc_topic, topic_term, doc_lengths, term_frequency, vocab = model2.dtm_vis(time=0, corpus=corpus_tfidf_dtm)
In [33]:
# Build and render the interactive topic map for time slice 0.
vis_dtm = pyLDAvis.prepare(
    topic_term_dists=topic_term,
    doc_topic_dists=doc_topic,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
)
pyLDAvis.display(vis_dtm)
Out[33]:
In [10]:
#Turn topics into dataframe for future visualization
# Flatten each topic's per-year top-5 terms into one tidy DataFrame
# (columns: Term, Frequency, Year, Topic) for plotting below.
num_topics = 10
topic_frames = []
for i in range(num_topics):
    # One list of term/weight pairs per time slice for topic i.
    result_topic = model2.print_topic_times(topic=i)
    df_result_topic = pd.DataFrame([t for lst in result_topic for t in lst[:5]])
    # Five rows per slice; slices start at 2015 -> integer year per row.
    df_result_topic['Year'] = (df_result_topic.index // 5 + 2015).astype(int)
    df_result_topic['Topic'] = str(i)
    topic_frames.append(df_result_topic)
# Fix: DataFrame.append was removed in pandas 2.0 — concatenate once instead
# of growing the frame inside the loop; rename without inplace.
topic_word_trend = (
    pd.concat(topic_frames, ignore_index=True)
    .rename(columns={0: 'Term', 1: 'Frequency'})
)
In [127]:
#Turn topics into dataframe for future visualization
topic_word_trend = pd.DataFrame()
for i in range(num_topics):
    result_topic=model2.print_topic_times(topic=i)
    df_result_topic=pd.DataFrame([t for lst in result_topic for t in lst[:5]])
    df_result_topic['Year']=(df_result_topic.index/5+2015).astype(int)
    df_result_topic['Topic']=str(i)
    topic_word_trend=topic_word_trend.append(df_result_topic,ignore_index=True)
topic_word_trend.rename(columns={0:'Term',1:'Frequency'},inplace=True)
In [89]:
#Save the result for easier future visualization
topic_word_trend.to_csv('Dynamic_topic_model.csv')
In [88]:
# One interactive line chart per topic: top-term frequency trends by year.
num_topics = 10
for i in range(num_topics):
    topic_df = topic_word_trend[topic_word_trend['Topic'] == str(i)]
    # Fix: pass column names for both axes — the original mixed a Series
    # (x=topic_df['Year']) with a column name (y='Frequency'); same result,
    # but inconsistent and fragile if the frame is filtered differently.
    fig = px.line(topic_df, x='Year', y='Frequency', color='Term', title=f'Topic:{i}')
    fig.show()
    
    
In [ ]: